diff --git a/clients/gtest/known_bugs.yaml b/clients/gtest/known_bugs.yaml index fa7592d3f..5ce181dad 100644 --- a/clients/gtest/known_bugs.yaml +++ b/clients/gtest/known_bugs.yaml @@ -3,7 +3,13 @@ # Wildcards can be used for the function Known bugs: +- { function: "gemm_ex", a_type: "bf16_r", b_type: "bf16_r", c_type: "bf16_r", d_type: "bf16_r", compute_type: "f32_r", transA: 'C', transB: 'N', M: 512, N: 512, K: 512, lda: 512, ldb: 512, ldc: 512, ldd: 512, alpha: 5.0, alphai: 0.0, beta: 0.0, betai: 0.0 } +- { function: "gemm_ex", a_type: "bf16_r", b_type: "bf16_r", c_type: "bf16_r", d_type: "bf16_r", compute_type: "f32_r", transA: 'C', transB: 'N', M: 512, N: 512, K: 512, lda: 512, ldb: 512, ldc: 512, ldd: 512, alpha: 0.0, alphai: 0.0, beta: 3.0, betai: 0.0 } +- { function: "gemm_ex", a_type: "bf16_r", b_type: "bf16_r", c_type: "bf16_r", d_type: "bf16_r", compute_type: "f32_r", transA: 'C', transB: 'N', M: 512, N: 512, K: 512, lda: 512, ldb: 512, ldc: 512, ldd: 512, alpha: 1.0, alphai: 0.0, beta: 3.0, betai: 0.0 } +- { function: "gemm_ex", a_type: "bf16_r", b_type: "bf16_r", c_type: "bf16_r", d_type: "bf16_r", compute_type: "f32_r", transA: 'C', transB: 'N', M: 512, N: 512, K: 512, lda: 512, ldb: 512, ldc: 512, ldd: 512, alpha: 1.0, alphai: 0.0, beta: 1.0, betai: 0.0 } - { function: "gemm_ex", a_type: "bf16_r", b_type: "bf16_r", c_type: "bf16_r", d_type: "bf16_r", compute_type: "f32_r", transA: 'C', transB: 'N', M: 1024, N: 1024, K: 1024, lda: 1024, ldb: 1024, ldc: 1024, ldd: 1024, alpha: 5.0, alphai: 0.0, beta: 0.0, betai: 0.0 } - { function: "gemm_ex", a_type: "bf16_r", b_type: "bf16_r", c_type: "bf16_r", d_type: "bf16_r", compute_type: "f32_r", transA: 'C', transB: 'N', M: 1024, N: 1024, K: 1024, lda: 1024, ldb: 1024, ldc: 1024, ldd: 1024, alpha: 0.0, alphai: 0.0, beta: 3.0, betai: 0.0 } - { function: "gemm_ex", a_type: "bf16_r", b_type: "bf16_r", c_type: "bf16_r", d_type: "bf16_r", compute_type: "f32_r", transA: 'C', transB: 'N', M: 1024, N: 1024, K: 1024, lda: 1024, ldb: 1024, ldc: 1024, ldd: 1024, alpha: 1.0, alphai: 0.0, beta: 3.0, betai: 0.0 } - { function: "gemm_ex", a_type: "bf16_r", b_type: "bf16_r", c_type: "bf16_r", d_type: "bf16_r", compute_type: "f32_r", transA: 'C', transB: 'N', M: 1024, N: 1024, K: 1024, lda: 1024, ldb: 1024, ldc: 1024, ldd: 1024, alpha: 1.0, alphai: 0.0, beta: 1.0, betai: 0.0 } +- { function: "gemm_strided_batched_ex", a_type: "bf16_r", b_type: "bf16_r", c_type: "bf16_r", d_type: "bf16_r", compute_type: "f32_r", transA: 'C', transB: 'N', M: 512, N: 512, K: 512, lda: 512, ldb: 512, ldc: 512, ldd: 512, alpha: -2.0, alphai: 0.0, beta: -3.0, betai: 0.0, batch_count: 3, stride_a: 262144, stride_b: 262144, stride_c: 262144, stride_d: 262144 } +- { function: "gemm_strided_batched_ex", a_type: "bf16_r", b_type: "bf16_r", c_type: "bf16_r", d_type: "bf16_r", compute_type: "f32_r", transA: 'C', transB: 'N', M: 512, N: 512, K: 512, lda: 512, ldb: 512, ldc: 512, ldd: 512, alpha: 0.0, alphai: 0.0, beta: 1.0, betai: 0.0, batch_count: 3, stride_a: 262144, stride_b: 262144, stride_c: 262144, stride_d: 262144 } diff --git a/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Alik_Bljk_BBH.yaml b/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Alik_Bljk_BBH.yaml index 79e819a61..44b823d28 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Alik_Bljk_BBH.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Alik_Bljk_BBH.yaml @@ -7,6 +7,7 @@ ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 7 DestDataType: 7 HighPrecisionAccumulate: true @@ -30,6 +31,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -60,6 +62,7 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false @@ -150,6 +153,7 @@ ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 7 DestDataType: 7 HighPrecisionAccumulate: true @@ -173,6 +177,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -232,6 +237,7 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false @@ -322,6 +328,7 @@ ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 7 DestDataType: 7 HighPrecisionAccumulate: true @@ -345,6 +352,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -404,6 +412,7 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false @@ -494,6 +503,7 @@ ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 7 DestDataType: 7 HighPrecisionAccumulate: true @@ -517,6 +527,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -576,6 +587,7 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false @@ -666,6 +678,7 @@ ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 7 DestDataType: 7 HighPrecisionAccumulate: true @@ -689,6 +702,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -748,6 +762,7 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false @@ -838,6 +853,7 @@ ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 7 DestDataType: 7 HighPrecisionAccumulate: true @@ -861,6 +877,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -906,59 +923,56 @@ WorkGroupMappingType: B _staggerStrideShift: 3 - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 8 + AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 3 - AssertSummationElementMultiple: 8 + AssertMinApproxSize: 0 + AssertSummationElementMultiple: 1 AssignedDerivedParameters: false AssignedProblemIndependentDerivedParameters: true - BufferLoad: true + BufferLoad: false BufferStore: true CheckDimOverflow: 0 CheckTensorDimAsserts: false - DepthU: 32 + DepthU: 16 DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr - ExpandPointerSwap: true - FractionalLoad: 0 - GlobalLoadVectorWidthA: 4 - GlobalLoadVectorWidthB: 4 + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 GlobalRead2A: true GlobalRead2B: true GlobalReadCoalesceGroupA: true GlobalReadCoalesceGroupB: true GlobalReadCoalesceVectorA: true GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 4 + GlobalReadVectorWidth: 1 GlobalSplitU: 1 GlobalSplitUSummationAssignmentRoundRobin: true GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 4 + GlobalWriteVectorWidth: 1 GuaranteeNoPartialA: true GuaranteeNoPartialB: true - ISA: [9, 0, 8] + ISA: [0, 0, 0] InnerUnroll: 1 InterleaveAlpha: 0 - KernelLanguage: Assembly - LSCA: 32 - LSCB: 32 - LSPA: 32 - LSPB: 32 - LVCA: 8 - LVCB: 8 - LVPA: 8 - LVPB: 8 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 LdcEqualsLdd: true - LdsNumElements: 14336 - LdsNumElementsAlignedA: 2048 - LdsNumElementsAlignedB: 4096 + LdsNumElements: 6144 LdsOffsetA: 0 - LdsOffsetA_Blk: 8192 LdsOffsetB: 2048 - LdsOffsetB_Blk: 10240 LdsPadA: 0 LdsPadB: 0 LocalDotLayout: 1 @@ -971,11 +985,11 @@ LocalWriteUseSgprB: false LoopDoWhile: false LoopTail: true - LoopUnroll: 32 - MacroTile0: 64 - MacroTile1: 128 - MacroTileA: 64 - MacroTileB: 128 + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 256 + MacroTileA: 128 + MacroTileB: 256 MacroTileShapeMax: 64 MacroTileShapeMin: 1 MaxOccupancy: 40 @@ -984,14 +998,14 @@ NonTemporalA: 0 NonTemporalB: 0 NonTemporalC: 0 - NumElementsPerThread: 32 - NumGlobalWriteVectorsPerThread: 8 - NumLoadsA: 2 - NumLoadsB: 4 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 8 + NumLoadsB: 16 NumLoadsCoalescedA: 1 NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 2 - NumLoadsPerpendicularB: 4 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 16 NumThreads: 256 OptNoLoadLoop: 1 PackBatchDims: 0 @@ -1006,7 +1020,7 @@ PerformanceWaitLocation: -1 PersistentKernel: 0 PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: true + PrefetchGlobalRead: false PrefetchLocalRead: true ProblemType: AssignedDerivedParameters: true @@ -1014,6 +1028,7 @@ ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 7 DestDataType: 7 HighPrecisionAccumulate: true @@ -1037,6 +1052,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -1052,12 +1068,12 @@ UseInitialStrides: false ZeroPadA: [] ZeroPadB: [] - ReplacementKernel: true + ReplacementKernel: false ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bljk_BBH_MT64x128x32_SE_ + SolutionNameMin: Cijk_Alik_Bljk_BBH_MT128x256x16_SE_ StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -1066,21 +1082,21 @@ SubGroupA: 16 SubGroupB: 16 SuppressNoLoadLoop: false - ThreadTile: [4, 8] - ThreadTile0: 4 - ThreadTile1: 8 - ThreadTileA: 4 - ThreadTileB: 8 + ThreadTile: [8, 16] + ThreadTile0: 8 + ThreadTile1: 16 + ThreadTileA: 8 + ThreadTileB: 16 UnrollMemFence: false - UseSgprForGRO: 1 + UseSgprForGRO: 0 Valid: true VectorAtomicWidth: 1 VectorStore: true - VectorWidth: 4 + VectorWidth: 1 WorkGroup: [16, 16, 1] WorkGroupMapping: 8 WorkGroupMappingType: B - _staggerStrideShift: 2 + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 1 AssertFree1ElementMultiple: 1 @@ -1096,6 +1112,7 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: false @@ -1186,6 +1203,7 @@ ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 7 DestDataType: 7 HighPrecisionAccumulate: true @@ -1209,6 +1227,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -1253,6 +1272,181 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [0, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_BBH_MT16x64x16_SE_ + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 8] + ThreadTile0: 2 + ThreadTile1: 8 + ThreadTileA: 2 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 1 @@ -1268,6 +1462,186 @@ DirectToLds: false DirectToLdsA: false DirectToLdsB: false + DisableAtomicFail: 0 + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 8] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + ConvolutionConfig: [] + DataType: 7 + DestDataType: 7 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SetConstStrideB: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: true + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_BBH_MT64x128x32_SE_ + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 8 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableAtomicFail: 0 DisableKernelPieces: 0 EdgeType: ShiftPtr ExpandPointerSwap: true @@ -1362,6 +1736,7 @@ ComplexConjugateA: false ComplexConjugateB: false ComputeDataType: 0 + ConvolutionConfig: [] DataType: 7 DestDataType: 7 HighPrecisionAccumulate: true @@ -1385,6 +1760,7 @@ NumIndicesSummation: 1 OperationType: GEMM SetConstStrideA: [] + SetConstStrideB: [] SilentHighPrecisionAccumulate: false TLUA: false TLUB: false @@ -1404,7 +1780,7 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 7 + SolutionIndex: 9 SolutionNameMin: Cijk_Alik_Bljk_BBH_MT32x32x32_SE_ StaggerU: 32 StaggerUMapping: 0 @@ -1430,130 +1806,158 @@ WorkGroupMappingType: B _staggerStrideShift: 2 - [2, 3, 0, 1] -- - - [959, 1024, 1, 1024] - - [3, 1055.49] - - - [960, 1023, 1, 1024] - - [3, 1071.67] - - - [960, 1024, 1, 1023] - - [3, 1069.84] - - - [960, 1024, 1, 1025] - - [3, 1077.07] - - - [960, 1025, 1, 1024] - - [2, 822.52] - - - [961, 1024, 1, 1024] - - [2, 813.489] - - - [1023, 1024, 1, 1024] - - [0, 865.973] - - - [1024, 1023, 1, 1024] - - [0, 865.805] - - - [1024, 1024, 1, 1023] - - [2, 866.42] - - - [1024, 1024, 1, 1025] - - [0, 865.43] +- - - [512, 512, 1, 513] + - [0, 381.611] + - - [511, 512, 1, 512] + - [0, 379.091] + - - [2048, 2049, 1, 2048] + - [1, 1383.87] + - - [480, 511, 1, 512] + - [0, 357.991] + - - [64, 1023, 1, 1024] + - [6, 111.903] + - - [2879, 3072, 1, 3072] + - [3, 1661.36] + - - [480, 512, 1, 511] + - [0, 358.482] + - - [4096, 4096, 1, 4096] + - [8, 16868.2] + - - [480, 513, 1, 512] + - [0, 359.311] + - - [2880, 3072, 1, 3073] + - [3, 1658.3] + - - [2880, 3071, 1, 3072] + - [3, 1661.59] + - - [3840, 4097, 1, 4096] + - [1, 1597.42] - - [1024, 1025, 1, 1024] - - [0, 866.658] + - [2, 857.747] + - - [512, 512, 1, 511] + - [0, 380.902] - - [1025, 1024, 1, 1024] - - [0, 866.155] - - - [2039, 2048, 1, 2048] - - [1, 1376.9] - - - [2040, 2047, 1, 2048] - - [1, 1378.17] - - - [2040, 2048, 1, 2047] - - [1, 1372.13] - - - [2040, 2048, 1, 2049] - - [1, 1387.02] - - - [2040, 2049, 1, 2048] - - [1, 1380.06] - - - [2041, 2048, 1, 2048] - - [1, 1379.05] + - [2, 859.448] + - - [959, 1024, 1, 1024] + - [3, 1067.68] + - - [480, 512, 1, 513] + - [0, 358.656] - - [2047, 2048, 1, 2048] - - [1, 1384.61] - - - [2048, 2047, 1, 2048] - - [1, 1383.95] - - - [2048, 2048, 1, 2047] - - [1, 1374.29] - - - [2048, 2048, 1, 2049] - - [1, 1391.34] - - - [2048, 2049, 1, 2048] - - [1, 1384.69] + - [1, 1384.19] + - - [479, 512, 1, 512] + - [0, 357.293] + - - [3840, 4096, 1, 4096] + - [8, 17340.1] - - [2049, 2048, 1, 2048] - - [1, 1383.0] - - - [2999, 3072, 1, 3072] - - [1, 1614.3] - - - [3000, 3071, 1, 3072] - - [1, 1614.82] - - - [3000, 3072, 1, 3071] - - [1, 1600.23] - - - [3000, 3072, 1, 3073] - - [1, 1608.61] - - - [3000, 3073, 1, 3072] - - [1, 1614.31] - - - [3001, 3072, 1, 3072] - - [1, 1616.45] - - - [3071, 3072, 1, 3072] - - [1, 1651.37] + - [1, 1383.81] + - - [960, 1023, 1, 1024] + - [3, 1069.66] + - - [480, 512, 1, 512] + - [8, 2545.11] + - - [1024, 1024, 1, 1025] + - [2, 860.273] + - - [64, 1024, 1, 1023] + - [6, 113.741] + - - [3072, 3072, 1, 3072] + - [8, 15664.8] + - - [960, 1025, 1, 1024] + - [2, 815.013] - - [3072, 3071, 1, 3072] - - [1, 1652.19] - - - [3072, 3072, 1, 3071] - - [1, 1636.33] + - [1, 1650.83] + - - [1920, 2047, 1, 2048] + - [5, 1758.25] + - - [1921, 2048, 1, 2048] + - [1, 1295.9] + - - [4096, 4097, 1, 4096] + - [1, 1604.45] + - - [960, 1024, 1, 1025] + - [3, 1078.27] + - - [961, 1024, 1, 1024] + - [0, 805.424] + - - [4096, 4095, 1, 4096] + - [1, 1607.6] + - - [512, 512, 1, 512] + - [8, 3445.01] - - [3072, 3072, 1, 3073] - - [1, 1645.1] + - [1, 1645.65] + - - [3071, 3072, 1, 3072] + - [1, 1650.57] + - - [2880, 3073, 1, 3072] + - [1, 1551.9] + - - [2048, 2047, 1, 2048] + - [1, 1383.25] + - - [2880, 3072, 1, 3072] + - [8, 17010.8] + - - [63, 1024, 1, 1024] + - [6, 109.939] + - - [1920, 2049, 1, 2048] + - [1, 1306.24] + - - [960, 1024, 1, 1024] + - [8, 13827.4] + - - [1920, 2048, 1, 2048] + - [8, 15206.0] + - - [4096, 4096, 1, 4097] + - [1, 1627.47] + - - [2048, 2048, 1, 2048] + - [8, 13584.8] + - - [3072, 3072, 1, 3071] + - [1, 1639.2] + - - [3840, 4095, 1, 4096] + - [4, 1782.78] + - - [3840, 4096, 1, 4097] + - [4, 1872.84] + - - [3841, 4096, 1, 4096] + - [1, 1592.57] + - - [2048, 2048, 1, 2047] + - [1, 1375.46] + - - [481, 512, 1, 512] + - [0, 359.439] + - - [3839, 4096, 1, 4096] + - [4, 1789.24] + - - [64, 1025, 1, 1024] + - [6, 112.921] - - [3072, 3073, 1, 3072] - - [1, 1651.42] - - - [3073, 3072, 1, 3072] - - [1, 1652.4] - - - [4079, 4096, 1, 4096] - - [1, 1600.96] - - - [4080, 4095, 1, 4096] - - [1, 1600.47] - - - [4080, 4096, 1, 4095] - - [4, 1604.11] - - - [4080, 4096, 1, 4097] - - [1, 1621.16] - - - [4080, 4097, 1, 4096] - - [1, 1602.61] - - - [4081, 4096, 1, 4096] - - [1, 1599.52] + - [1, 1650.31] + - - [1919, 2048, 1, 2048] + - [5, 1758.38] + - - [2881, 3072, 1, 3072] + - [1, 1553.64] - - [4095, 4096, 1, 4096] - - [1, 1604.71] - - - [4096, 4095, 1, 4096] - - [1, 1604.32] - - - [4096, 4096, 1, 4095] - - [4, 1609.64] - - - [4096, 4096, 1, 4097] - - [1, 1625.9] - - - [4096, 4097, 1, 4096] - - [1, 1605.8] + - [1, 1605.24] + - - [1024, 1023, 1, 1024] + - [0, 859.2] - - [4097, 4096, 1, 4096] - - [1, 1603.85] - - - [960, 1024, 1, 1024] - - [5, 9007.14] + - [1, 1605.11] + - - [3840, 4096, 1, 4095] + - [4, 1872.71] + - - [512, 511, 1, 512] + - [0, 379.349] + - - [3073, 3072, 1, 3072] + - [1, 1652.19] + - - [4096, 4096, 1, 4095] + - [4, 1608.39] + - - [1024, 1024, 1, 1023] + - [0, 864.574] - - [1024, 1024, 1, 1024] - - [5, 7604.43] - - - [2040, 2048, 1, 2048] - - [5, 11066.3] - - - [2048, 2048, 1, 2048] - - [5, 14117.1] - - - [3000, 3072, 1, 3072] - - [5, 12651.4] - - - [3072, 3072, 1, 3072] - - [5, 12416.1] - - - [4080, 4096, 1, 4096] - - [5, 16848.0] - - - [4096, 4096, 1, 4096] - - [5, 16911.1] - - - [63, 1024, 1, 1024] - - [6, 111.559] - - - [64, 1023, 1, 1024] - - [6, 113.219] - - - [64, 1024, 1, 1023] - - [6, 114.051] - - - [64, 1024, 1, 1025] - - [6, 114.243] - - - [64, 1025, 1, 1024] - - [6, 112.937] + - [8, 8505.59] + - - [512, 513, 1, 512] + - [0, 379.887] + - - [960, 1024, 1, 1023] + - [3, 1078.56] + - - [513, 512, 1, 512] + - [0, 379.458] - - [65, 1024, 1, 1024] - - [6, 109.143] + - [7, 105.664] + - - [64, 1024, 1, 1025] + - [6, 113.532] - - [64, 1024, 1, 1024] - - [7, 1762.34] + - [9, 967.544] + - - [1920, 2048, 1, 2047] + - [5, 1773.88] + - - [2880, 3072, 1, 3071] + - [3, 1659.92] + - - [1920, 2048, 1, 2049] + - [5, 1786.73] + - - [1023, 1024, 1, 1024] + - [0, 862.294] + - - [2048, 2048, 1, 2049] + - [1, 1388.52] - null